home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Amiga Tools 4
/
Amiga Tools 4.iso
/
grafix
/
tools
/
amipeg_0.4
/
sjrevdct.s
< prev
next >
Wrap
Text File
|
1994-04-22
|
13KB
|
782 lines
;
; This code implements the basic idct on a 8x8 pixel block.
; Basically, it's the same as in the JPEG engine, with the sole difference
; that it's inlined and register-wise a little bit more optimized there.
;
; This is a complete rewrite in assembler. Heavy stuff. Lotsa work.
;
; Michael Rausch 14-4-94 1:14:00
;
;
; The whole code does not handle D-frames very well, but I'll fix it one day.
;
; Fixed-point setup for the IDCT.
;   DCTSIZE    - the block is DCTSIZE x DCTSIZE 16-bit coefficients.
;   PASS1_BITS - extra fractional bits kept in the results of pass 1.
;   CONST_BITS - fractional bits of the FIX_* multipliers below.
; Each FIX_a_bbbbbbbbb is the decimal constant a.bbbbbbbbb scaled by
; 2^CONST_BITS (8192) and rounded, e.g. 0.298631336 * 8192 = 2446.
; A leading underscore marks a constant that is stored negated.
; The trailing notes give a running number, the sign and the magnitude in hex
; (the first line also spells $98e out in binary).
; NOTE(review): the meaning of the trailing "u" marks is not evident from this
; file - confirm with the author's notes before relying on it.
DCTSIZE EQU 8
PASS1_BITS EQU 2
CONST_BITS EQU 13
FIX_0_298631336 EQU 2446 ;1 + $98e 100110001110
_FIX_0_390180644 EQU -3196 ;2 - $c7c
FIX_0_541196100 EQU 4433 ;3 + $1151 u
FIX_0_765366865 EQU 6270 ;4 + $187e u
_FIX_0_899976223 EQU -7373 ;5 - $1ccd
FIX_1_175875602 EQU 9633 ;6 + $25a1
FIX_1_501321110 EQU 12299 ;7 + $300b
_FIX_1_847759065 EQU -15137 ;8 - $3b21 u
_FIX_1_961570560 EQU -16069 ;9 - $3ec5
FIX_2_053119869 EQU 16819 ;10 + $41b3
_FIX_2_562915447 EQU -20995 ;11 - $5203
FIX_3_072711026 EQU 25172 ;12 + $6254
; FIX_1_847759065-FIX_0_765366865 = 2* FIX_0_541196100
; (with the rounded integers above: 15137 - 6270 = 8867 versus 2 * 4433 = 8866,
; so the identity holds up to rounding)
; **************************************************************************
; jrevdct - in-place inverse DCT of one 8x8 block, pass 1 (rows).
;
; Reached by a branch from @j_rev_dct, which has already pushed ri_regs.
;
; In:	a0 = address of the block (DCTSIZE*DCTSIZE signed words)
;
; Each of the 8 rows is transformed in place; the results keep PASS1_BITS
; extra fractional bits.  The even part of a row is computed inline and its
; four sums (tmp10..tmp13) are parked in a scratch area on the stack.  The
; odd part is picked from 16 specialised variants, one per zero/non-zero
; pattern of coefficients 7,5,3,1 (labels odd1_<7><5><3><1>).  Every variant
; leaves the odd-part results in
;	d1 = tmp0   d2 = tmp1   d5 = tmp2   d6 = tmp3
; and leaves through jmp (a5).  a5 is compose1 during pass 1 and compose2
; during pass 2, so the variants are shared by both passes.  The o2.... labels
; found between the variants are dispatch steps of pass 2 that read a column
; (stride DCTSIZE*2) instead of a row.
;
; Stack layout while a row is being processed:
;	 0(sp)	saved row counter (d7)
;	 4(sp)	saved block pointer (a0)
;	 8(sp)	tmp10, tmp11, tmp12, tmp13  (the 16 bytes reserved below)
; The scratch area used to be addressed as 12(sp); with the two longs pushed
; on top of it that put tmp13 four bytes above the reserved space, on top of
; the first register saved by @j_rev_dct.  It is addressed as 8(sp) now.
;
; Registers written here: d0-d7, a1, a5 (a3 is written by pass 2).
jrevdct:
	sub.w	#16,sp			; room for tmp10..tmp13
	move.l	a0,-(sp)		; block pointer, reloaded for pass 2
	lea	compose1(pc),a5		; exit of the shared odd-part code
	moveq	#DCTSIZE-1,d7		; 8 rows
idct1:	move.l	d7,-(sp)		; d7 doubles as scratch inside the loop
	lea	2(a0),a1
	move.l	(a1)+,d2		; coefficients 1 (high word) and 2 (low word)
	move.l	d2,d0
	move.l	(a1)+,d4		; coefficients 3 and 4
	move.l	(a1)+,d3		; coefficients 5 and 6
	or.l	d4,d0
	or.w	(a1)+,d0		; coefficient 7
	or.l	d3,d0
	bne.s	idct1_no_ac0
; all seven AC terms are zero: every output is the DC term << PASS1_BITS
	move.w	(a0),d0
	lsl.w	#PASS1_BITS,d0
	move.w	d0,d1
	swap	d0
	move.w	d1,d0			; same value in both halves
	REPT	4
	move.l	d0,(a0)+
	ENDR
	bra	idct1_next
; ---- even part: coefficients 0, 2, 4, 6 ----
idct1_no_ac0:
	move.w	d2,d1			; 2
	add.w	d3,d1			; 6
	muls	#FIX_0_541196100,d1	; z1 = (c2 + c6) * 0.541
	muls	#_FIX_1_847759065,d3
	add.l	d1,d3			; tmp2 = z1 - c6 * 1.848
	muls	#FIX_0_765366865,d2
	add.l	d1,d2			; tmp3 = z1 + c2 * 0.765
	move.w	(a0),d0
	ext.l	d0			; 0
	ext.l	d4			; 4
	move.l	d0,d5
	sub.l	d4,d5			; c0 - c4
	add.l	d0,d4			; c0 + c4
; scale by 2^CONST_BITS in two steps (5 + 8 bits); the 4 added in between
; becomes 1<<10, the rounding term for the later shift by
; CONST_BITS-PASS1_BITS
	lsl.l	#5,d4
	lsl.l	#5,d5
	addq.l	#1<<2,d4
	addq.l	#1<<2,d5
	lsl.l	#8,d4			; tmp0
	lsl.l	#8,d5			; tmp1
	lea	8(sp),a1		; skip saved d7 and a0 -> 16 bytes of scratch
	move.l	d4,d0
	add.l	d2,d4
	move.l	d4,(a1)+		; tmp10 = tmp0 + tmp3
	sub.l	d2,d0
	move.l	d5,d1
	add.l	d3,d5
	move.l	d5,(a1)+		; tmp11 = tmp1 + tmp2
	sub.l	d3,d1
	move.l	d1,(a1)+		; tmp12 = tmp1 - tmp2
	move.l	d0,(a1)+		; tmp13 = tmp0 - tmp3
; ---- odd part: dispatch on which of coefficients 7,5,3,1 are non-zero ----
odd_part1:
	move.w	7*2(a0),d1		;7
	beq	o0xxx
o1xxx:	move.w	5*2(a0),d2		;5
	beq	o10xx
o11xx:	move.w	3*2(a0),d3		;3
	beq	o110x
o111x:	move.w	1*2(a0),d4		;1
	bne.s	odd1_1111
; 7531
odd1_1110:
	move.w	d2,d6			; z4 = c5
	move.w	d1,d0			; z1 = c7
	moveq	#0,d4
	bra.s	abk_2
; 7531
odd1_1111:
	move.w	d2,d6
	add.w	d4,d6			; z4 = c5 + c1
	move.w	d1,d0
	add.w	d4,d0			; z1 = c7 + c1
	muls	#FIX_1_501321110,d4
abk_2:	move.w	d1,d5
	add.w	d3,d5			; z3 = c7 + c3
	move.w	d5,d7
	add.w	d6,d7
	muls	#FIX_1_175875602,d7	; z5 = (z3 + z4) * 1.176
	muls	#_FIX_1_961570560,d5
	muls	#_FIX_0_390180644,d6
	add.l	d7,d5			; z3
	add.l	d7,d6			; z4
	move.w	d2,d7
	add.w	d3,d7			; z2 = c5 + c3
	muls	#FIX_0_298631336,d1
	muls	#FIX_2_053119869,d2
	muls	#FIX_3_072711026,d3
	muls	#_FIX_0_899976223,d0	; z1
	muls	#_FIX_2_562915447,d7	; z2
	add.l	d0,d1
	add.l	d7,d2
	add.l	d5,d1			; tmp0
	add.l	d6,d2			; tmp1
	add.l	d3,d5
	add.l	d4,d6
	add.l	d7,d5			; tmp2
	add.l	d0,d6			; tmp3
	jmp	(a5)
; pass 2 dispatch step (column access)
o2110x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_1101
; 7531
odd1_1100:
	move.w	d2,d6			; z4 = c5
	move.w	d1,d3			; z1 = c7 (d3 is free, c3 is zero)
	moveq	#0,d4
	bra.s	abk_3
o110x:	move.w	1*2(a0),d4		;1
	beq.s	odd1_1100
; 7531
odd1_1101:
	move.w	d2,d6
	move.w	d1,d3
	add.w	d4,d6			; z4 = c5 + c1
	add.w	d4,d3			; z1 = c7 + c1
	muls	#FIX_1_501321110,d4
abk_3:
	move.w	d1,d5			; z3 = c7
	move.w	d5,d7
	add.w	d6,d7
	muls	#FIX_1_175875602,d7	; z5
	muls	#_FIX_1_961570560,d5
	muls	#_FIX_0_390180644,d6
	add.l	d7,d5			; z3
	add.l	d7,d6			; z4
	move.w	d2,d0			; z2 = c5
	muls	#FIX_0_298631336,d1
	muls	#FIX_2_053119869,d2
	muls	#_FIX_0_899976223,d3	; z1
	muls	#_FIX_2_562915447,d0	; z2
	add.l	d3,d1
	add.l	d0,d2
	add.l	d5,d1			; tmp0
	add.l	d6,d2			; tmp1
	add.l	d4,d6
	add.l	d0,d5			; tmp2
	add.l	d3,d6			; tmp3
	jmp	(a5)
o10xx:	move.w	3*2(a0),d3		;3
	beq	o100x
o101x:	move.w	1*2(a0),d4		;1
	beq.s	odd1_1010
; 7531
odd1_1011:
	move.w	d1,d5
	add.w	d3,d5			; z3 = c7 + c3
	move.w	d1,d0
	move.w	d4,d6			; z4 = c1
	add.w	d4,d0			; z1 = c7 + c1
	muls	#FIX_1_501321110,d4
	move.w	d5,d7
	add.w	d6,d7			; z3 + z4
	muls	#_FIX_0_390180644,d6
abk_4:	muls	#FIX_1_175875602,d7	; z5
	muls	#_FIX_1_961570560,d5
	add.l	d7,d6			; z4
	add.l	d7,d5			; z3
	move.w	d3,d7			; z2 = c3
	muls	#FIX_0_298631336,d1
	muls	#FIX_3_072711026,d3
	muls	#_FIX_0_899976223,d0	; z1
	muls	#_FIX_2_562915447,d7	; z2
	add.l	d0,d1
	move.l	d6,d2
	add.l	d5,d1			; tmp0
	add.l	d7,d2			; tmp1 = z2 + z4
	add.l	d3,d5
	add.l	d4,d6
	add.l	d7,d5			; tmp2
	add.l	d0,d6			; tmp3
	jmp	(a5)
; pass 2 dispatch steps (column access)
o210xx:	move.w	3*DCTSIZE*2(a0),d3	;3
	beq	o2100x
o2101x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_1011
; 7531
odd1_1010:
	move.w	d1,d5
	add.w	d3,d5			; z3 = c7 + c3
	move.w	d1,d0			; z1 = c7
	moveq	#0,d4
	move.w	d5,d7			; z3 + z4 with z4 = 0
	moveq	#0,d6
	bra.s	abk_4
o100x:	move.w	1*2(a0),d4		;1
	beq.s	odd1_1000
; 7531
odd1_1001:
	move.w	d1,d0
	add.w	d4,d0			; z1 = c7 + c1
	move.w	d1,d5			; z3 = c7
	move.w	d4,d6			; z4 = c1
	move.w	d0,d7			; z3 + z4
	muls	#FIX_1_175875602,d7	; z5
	muls	#_FIX_1_961570560,d5
	muls	#_FIX_0_390180644,d6
	add.l	d7,d5			; z3 = tmp2
	add.l	d7,d6			; z4
	muls	#FIX_0_298631336,d1
	muls	#FIX_1_501321110,d4
	muls	#_FIX_0_899976223,d0	; z1
	add.l	d0,d1
	add.l	d5,d1			; tmp0
	move.l	d6,d2			; tmp1 = z4
	add.l	d4,d6
	add.l	d0,d6			; tmp3
	jmp	(a5)
; pass 2 dispatch step (column access)
o2100x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_1001
; 7531
; only coefficient 7: all four results are c7 times a constant folded at
; assembly time
odd1_1000:
	move.w	d1,d2
	move.w	d1,d5
	move.w	d1,d6
	muls	#FIX_1_175875602,d2
	muls	#FIX_1_175875602+_FIX_0_899976223,d6
	muls	#FIX_1_175875602+_FIX_1_961570560,d5
	muls	#FIX_1_175875602+_FIX_0_899976223+_FIX_1_961570560+FIX_0_298631336,d1
	jmp	(a5)
o0xxx:	move.w	5*2(a0),d2		;5
	beq	o00xx
o01xx:	move.w	3*2(a0),d3		;3
	beq	o010x
o011x:	move.w	1*2(a0),d4		;1
	beq.s	odd1_0110
; 7531
odd1_0111:				; opt8
	move.w	d2,d6
	add.w	d4,d6			; z4 = c5 + c1
	move.w	d4,d1			; z1 = c1
	muls	#FIX_1_501321110,d4
	muls	#_FIX_0_899976223,d1
abk_1:	move.w	d2,d0
	add.w	d3,d0			; z2 = c5 + c3
	move.w	d3,d5			; z3 = c3
	move.w	d5,d7
	add.w	d6,d7
	muls	#FIX_1_175875602,d7	; z5
	muls	#_FIX_1_961570560,d5
	muls	#_FIX_0_390180644,d6	; ???? 2
	add.l	d7,d5			; z3
	add.l	d7,d6			; z4
	muls	#FIX_2_053119869,d2
	muls	#FIX_3_072711026,d3
	muls	#_FIX_2_562915447,d0	; z2
	add.l	d0,d2
	add.l	d6,d2			; tmp1
	add.l	d4,d6
	add.l	d1,d6			; tmp3
	add.l	d5,d1			; tmp0 = z1 + z3
	add.l	d3,d5
	add.l	d0,d5			; tmp2
	jmp	(a5)
; pass 2 dispatch steps (column access)
o20xxx:	move.w	5*DCTSIZE*2(a0),d2	;5
	beq	o200xx
o201xx:	move.w	3*DCTSIZE*2(a0),d3	;3
	beq.s	o2010x
o2011x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_0111
; 7531
odd1_0110:
	move.w	d2,d6			; z4 = c5
	moveq	#0,d1
	moveq	#0,d4
	bra.s	abk_1
o010x:	move.w	1*2(a0),d4		;1
	beq.s	odd1_0100
; 7531
odd1_0101:
	move.w	d2,d6
	move.w	d2,d7			; z2 = c5
	add.w	d4,d6			; z4 = c5 + c1
	move.w	d4,d1			; z1 = c1
	move.w	d6,d5
	muls	#FIX_1_175875602,d5	; z5 (z3 is zero before adding z5)
	muls	#_FIX_0_390180644+FIX_1_175875602,d6
	muls	#FIX_2_053119869+_FIX_2_562915447,d2
	muls	#FIX_1_501321110,d4
	muls	#_FIX_0_899976223,d1
	muls	#_FIX_2_562915447,d7
	add.l	d6,d2			; tmp1
	add.l	d1,d6
	add.l	d5,d1			; tmp0
	add.l	d7,d5			; tmp2
	add.l	d4,d6			; tmp3
	jmp	(a5)
; pass 2 dispatch step (column access)
o2010x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_0101
; 7531
; only coefficient 5
odd1_0100:
	move.w	d2,d6
	move.w	d2,d1
	move.w	d2,d5
	muls	#FIX_1_175875602,d1
	muls	#FIX_1_175875602+_FIX_2_562915447,d5
	muls	#FIX_1_175875602+_FIX_0_390180644,d6
	muls	#FIX_1_175875602+_FIX_2_562915447+_FIX_0_390180644+FIX_2_053119869,d2
	jmp	(a5)
o00xx:	move.w	3*2(a0),d5		;3
	beq.s	o000x
o001x:	move.w	1*2(a0),d4		;1
	beq.s	odd1_0010
; 7531
odd1_0011:				; opt12
	move.w	d5,d2			; z2 = c3
	move.w	d5,d3
	move.w	d4,d1			; z1 = c1
	move.w	d4,d6			; z4 = c1
	move.w	d3,d7
	add.w	d4,d7
	muls	#FIX_1_175875602,d7	; z5
	muls	#_FIX_1_961570560,d5
	muls	#_FIX_0_390180644,d6
	add.l	d7,d5			; z3
	add.l	d7,d6			; z4
	muls	#_FIX_2_562915447+FIX_3_072711026,d3
	muls	#_FIX_0_899976223+FIX_1_501321110,d4
	muls	#_FIX_0_899976223,d1
	muls	#_FIX_2_562915447,d2
	add.l	d5,d1			; tmp0
	add.l	d6,d2			; tmp1
	add.l	d3,d5			; tmp2
	add.l	d4,d6			; tmp3
	jmp	(a5)
; pass 2 dispatch steps (column access)
o200xx:	move.w	3*DCTSIZE*2(a0),d5	;3
	beq	o2000x
o2001x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_0011
; 7531
; only coefficient 3 (held in d5)
odd1_0010:
	move.w	d5,d6
	move.w	d5,d2
	move.w	d5,d1
	muls	#FIX_1_175875602,d6
	muls	#FIX_1_175875602+_FIX_2_562915447,d2
	muls	#FIX_1_175875602+_FIX_1_961570560,d1
	muls	#FIX_1_175875602+_FIX_2_562915447+_FIX_1_961570560+FIX_3_072711026,d5
	jmp	(a5)
o000x:	move.w	1*2(a0),d6		;1
	beq.s	odd1_0000
; 7531
; only coefficient 1 (held in d6)
odd1_0001:				; opt 14
	move.w	d6,d5
	move.w	d6,d1
	move.w	d6,d2
	muls	#FIX_1_175875602,d5
	muls	#FIX_1_175875602+_FIX_0_899976223,d1
	muls	#FIX_1_175875602+_FIX_0_390180644,d2
	muls	#FIX_1_175875602+_FIX_0_899976223+_FIX_0_390180644+FIX_1_501321110,d6
	jmp	(a5)
; priority: 14 12 8 0
; 7531
; no odd coefficient at all (pass 1 only): the row is symmetric, so
; out0=out7=tmp10, out1=out6=tmp11, out2=out5=tmp12, out3=out4=tmp13
odd1_0000:
	moveq	#CONST_BITS-PASS1_BITS,d7	; optimized compose !
	lea	8(sp),a1		; scratch area, see stack layout above
	move.l	(a1)+,d0		; tmp10
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d0	; descaled value into the high word
	move.l	(a1)+,d1		; tmp11
	lsr.l	d7,d1			; descaled value into the low word
	move.w	d1,d0			; d0 = out0:out1
	move.l	(a1)+,d2		; tmp12
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d2
	move.l	(a1)+,d3		; tmp13
	lsr.l	d7,d3
	move.w	d3,d2			; d2 = out2:out3
	move.l	d0,(a0)+
	swap	d0			; out6:out7
	move.l	d2,(a0)+
	swap	d2			; out4:out5
	move.l	d2,(a0)+
	move.l	d0,(a0)+
	move.l	(sp)+,d7
	dbra	d7,idct1
	bra.s	idct1_ready
; keep 1 2 5 6
; compose1: combine the even sums in the scratch area with the odd results
; in d1/d2/d5/d6, descale by CONST_BITS-PASS1_BITS and store the row.
; Two results are packed into one long before each store.
compose1:
	moveq	#CONST_BITS-PASS1_BITS,d7
	lea	8(sp),a1		; scratch area, see stack layout above
	move.l	(a1)+,d4		; tmp10
	sub.l	d6,d4			; out7 = tmp10 - tmp3
	add.l	d6,d6
	add.l	d4,d6			; out0 = tmp10 + tmp3
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d6
	move.l	(a1)+,d3		; tmp11
	sub.l	d5,d3			; out6 = tmp11 - tmp2
	add.l	d5,d5
	add.l	d3,d5			; out1 = tmp11 + tmp2
	lsr.l	d7,d5
	move.w	d5,d6
	move.l	d6,(a0)+		; out0:out1
	move.l	(a1)+,d6		; tmp12
	sub.l	d2,d6			; out5 = tmp12 - tmp1
	add.l	d2,d2
	add.l	d6,d2			; out2 = tmp12 + tmp1
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d2
	move.l	(a1)+,d5		; tmp13
	sub.l	d1,d5			; out4 = tmp13 - tmp0
	add.l	d1,d1
	add.l	d5,d1			; out3 = tmp13 + tmp0
	lsr.l	d7,d1
	move.w	d1,d2
	move.l	d2,(a0)+		; out2:out3
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d5
	lsr.l	d7,d6
	move.w	d6,d5
	move.l	d5,(a0)+		; out4:out5
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d3
	lsr.l	d7,d4
	move.w	d4,d3
	move.l	d3,(a0)+		; out6:out7
idct1_next:
	move.l	(sp)+,d7		; row counter back
	dbra	d7,idct1
; ---------------------------------------------------------------------------
; jrevdct, pass 2 (columns) and exit.
;
; Reached once all rows are done.  The block pointer saved at the start of
; jrevdct is popped back into a0 and the 8 columns are processed; a0 moves
; on by one word per column (the (a0)+ store of output row 0), which is why
; the other stores use k*DCTSIZE*2-2(a0).
;
; The odd part reuses the odd1_.... variants of pass 1; they come back here
; through jmp (a5) with a5 = compose2 and d1/d2/d5/d6 = odd tmp0..tmp3.
; A column without any odd coefficient is handled inline by odd0_0000.
;
; Descaling: a final result has to be shifted right by
; CONST_BITS+PASS1_BITS+3 = 18 bits.  All terms are kept divided by 4, so a
; swap (= shift by 16) finishes the job; the rounding term
; 1<<(PASS1_BITS+3-1) is added to the DC input before it is scaled up.
;
; Writes a3 in addition to the registers used by pass 1.
; NOTE(review): ri_regs is not defined in this part of the file - confirm it
; covers every register the caller expects to survive (a3 and a5 included).
; ---------------------------------------------------------------------------
idct1_ready:
; *******************************************************
	move.l	(sp)+,a0		; block pointer pushed at jrevdct
	lea	compose2(pc),a5		; exit of the shared odd-part code
	moveq	#DCTSIZE-1,d7		; 8 columns
idct2:	move.l	d7,-(sp)		; d7 is used as scratch below
odd_part2:
	move.w	7*DCTSIZE*2(a0),d1	;7
	beq	o20xxx
o21xxx:	move.w	5*DCTSIZE*2(a0),d2	;5
	beq	o210xx
o211xx:	move.w	3*DCTSIZE*2(a0),d3	;3
	beq	o2110x
o2111x:	move.w	1*DCTSIZE*2(a0),d4	;1
	beq	odd1_1110
	bra	odd1_1111
o2000x:	move.w	1*DCTSIZE*2(a0),d6	;1
	bne	odd1_0001
; no odd coefficient in this column: the output is symmetric, so each even
; sum is stored twice.  The multipliers are divided by 4 at assembly time
; instead of shifting the products.
odd0_0000:
	move.w	2*DCTSIZE*2(a0),d2
	move.w	4*DCTSIZE*2(a0),d4
	move.w	6*DCTSIZE*2(a0),d3
	move.w	d2,d0
	add.w	d3,d0
	muls	#FIX_0_541196100/4,d0	; z1 / 4
	muls	#_FIX_1_847759065/4,d3
	add.l	d0,d3			; tmp2 / 4
	muls	#FIX_0_765366865/4,d2
	add.l	d0,d2			; tmp3 / 4
	move.w	(a0),d0
	add.w	#1<<(PASS1_BITS+3-1),d0	; precalc from the descaling part below
	ext.l	d4
	ext.l	d0
	move.l	d0,d5
	sub.l	d4,d5			; c0 - c4
	add.l	d0,d4			; c0 + c4
	moveq	#CONST_BITS-2,d0
	lsl.l	d0,d4			; tmp0 / 4
	lsl.l	d0,d5			; tmp1 / 4
	move.l	d4,d0
	add.l	d2,d4			; tmp10
	swap	d4
	move.w	d4,(a0)+		; row 0, and step to the next column
	sub.l	d2,d0			; tmp13
	move.w	d4,7*DCTSIZE*2-2(a0)	; row 7
	swap	d0
	move.w	d0,3*DCTSIZE*2-2(a0)	; row 3
	move.l	d5,d4
	move.w	d0,4*DCTSIZE*2-2(a0)	; row 4
	add.l	d3,d5			; tmp11
	swap	d5
	sub.l	d3,d4			; tmp12
	move.w	d5,1*DCTSIZE*2-2(a0)	; row 1
	swap	d4
	move.w	d5,6*DCTSIZE*2-2(a0)	; row 6
	move.w	d4,2*DCTSIZE*2-2(a0)	; row 2
	move.w	d4,5*DCTSIZE*2-2(a0)	; row 5
	move.l	(sp)+,d7
	dbra	d7,idct2
	bra	idct2_ready
; compose2: even part of the column, then combination with the odd results
; delivered in d1/d2/d5/d6 by the odd1_.... code.
compose2:
	move.w	2*DCTSIZE*2(a0),d3
	move.w	4*DCTSIZE*2(a0),d4
	move.w	6*DCTSIZE*2(a0),d7
	move.w	d3,d0
	add.w	d7,d0
	muls	#FIX_0_541196100,d0	; z1
	muls	#_FIX_1_847759065,d7
	add.l	d0,d7			; tmp2
	muls	#FIX_0_765366865,d3
	add.l	d0,d3			; tmp3
	asr.l	#2,d7
	asr.l	#2,d3
	move.l	d7,a3			; park tmp2/4, d7 is needed again
	move.w	(a0),d0
	add.w	#1<<(PASS1_BITS+3-1),d0	; precalc from the descaling part below
	ext.l	d4
	ext.l	d0
	move.l	d0,d7
	sub.l	d4,d7			; c0 - c4
	add.l	d0,d4			; c0 + c4
	moveq	#CONST_BITS-2,d0
	lsl.l	d0,d4			; tmp0 / 4
	lsl.l	d0,d7			; tmp1 / 4
	asr.l	#2,d6			; odd results to the same scale
	asr.l	#2,d5
	asr.l	#2,d2
	asr.l	#2,d1
	move.l	d4,d0
	add.l	d3,d4			; tmp10
	sub.l	d3,d0			; tmp13
	move.l	d7,d3
	add.l	a3,d7			; tmp11
	sub.l	a3,d3			; tmp12
	sub.l	d6,d4			; row 7 = tmp10 - tmp3
	add.l	d6,d6
	add.l	d4,d6			; row 0 = tmp10 + tmp3
	swap	d6 ; moveq #CONST_BITS+PASS1_BITS+3 -2 ,d6 ; asr.l d6,d3
	move.w	d6,(a0)+		; row 0, and step to the next column
	swap	d4
	move.w	d4,7*DCTSIZE*2-2(a0)
	sub.l	d1,d0			; row 4 = tmp13 - tmp0
	add.l	d1,d1
	add.l	d0,d1			; row 3 = tmp13 + tmp0
	swap	d1
	move.w	d1,3*DCTSIZE*2-2(a0)
	swap	d0
	move.w	d0,4*DCTSIZE*2-2(a0)
	sub.l	d5,d7			; row 6 = tmp11 - tmp2
	add.l	d5,d5
	add.l	d7,d5			; row 1 = tmp11 + tmp2
	swap	d5
	move.w	d5,1*DCTSIZE*2-2(a0)
	swap	d7
	move.w	d7,6*DCTSIZE*2-2(a0)
	sub.l	d2,d3			; row 5 = tmp12 - tmp1
	add.l	d2,d2
	add.l	d3,d2			; row 2 = tmp12 + tmp1
	swap	d2
	move.w	d2,2*DCTSIZE*2-2(a0)
	swap	d3
	move.w	d3,5*DCTSIZE*2-2(a0)
idct2_next:
	move.l	(sp)+,d7		; column counter back
	dbra	d7,idct2
; the label below used to end in ';' instead of ':'
idct2_ready:
	add.w	#16,sp			; drop the scratch area reserved at jrevdct
; movem.l (sp)+,JREVDCTREGS
	movem.l	(sp)+,ri_regs		; registers pushed by @j_rev_dct
	rts
; @j_rev_dct - exported entry point of the IDCT.
; In: a0 = address of the 8x8 coefficient block (jrevdct reads and writes it
; through a0).
; Pushes the registers listed in ri_regs and continues in jrevdct, which
; pops the same list and executes the rts back to the caller.
; NOTE(review): ri_regs is not defined in the visible part of the file.
XDEF @j_rev_dct
@j_rev_dct:
movem.l ri_regs,-(sp)
bra jrevdct
; Everything from the ifeq below to its endc is skipped by the assembler:
; ifeq assembles its body only when the expression is zero.
	ifeq	1
; **************************************************************************
; Pre compute singleton coefficient IDCT values.
;
; void init_pre_idct(void)
;
; PreIDCT holds 64 tables of 64 words.  Table n is cleared, gets the value
; 2048 in element n and is then run through @j_rev_dct, so it ends up as the
; IDCT of a block whose only non-zero coefficient is number n.
;
; Changes against the earlier version of this routine:
;  - the clear loop covers all 64*64 words (it stopped after half of them),
;  - the element index is doubled before it is used as a word offset,
;  - the table pointer steps by 64 words (128 bytes), matching the start
;    address PreIDCT+63*64*2 and the size of one table.
;
; Saves d2/a2; uses d0 and a0 as scratch.
; NOTE(review): relies on @j_rev_dct preserving d2 and a2 - confirm against
; the definition of ri_regs.
; XDEF @init_pre_idct
@init_pre_idct:
	movem.l	d2/a2,-(sp)
	lea	PreIDCT,a2
	move.w	#64*64*2/16-1,d2	; table size in bytes / 16 bytes per turn
preidctclr:
	clr.l	(a2)+
	clr.l	(a2)+
	clr.l	(a2)+
	clr.l	(a2)+
	dbra	d2,preidctclr
	lea	PreIDCT+63*64*2,a2	; last table first
	moveq	#63,d2			; coefficient number, 63 down to 0
preidctloop:
	move.w	d2,d0
	add.w	d0,d0			; coefficient number -> byte offset
	move.w	#2048,(a2,d0.w)		; the single non-zero coefficient
	move.l	a2,a0
	bsr	@j_rev_dct		; transform the table in place
	sub.w	#64*2,a2		; previous table (64 words)
	dbra	d2,preidctloop
	movem.l	(sp)+,d2/a2
	rts
; ************************************************************************************
; Perform the inverse DCT on one block of coefficients.
;
; void j_rev_dct_sparse (DCTBLOCK data, int pos)
;
; Shortcut for a block with a single non-zero coefficient.
; In:	a0 = data, the 8x8 block of words (overwritten with the result)
;	d0 = pos, number (0..63) of the non-zero coefficient
;
; pos = 0: the rounded DC value divided by 8 is written to all 64 words.
; pos > 0: each word of table PreIDCT[pos] is multiplied by the coefficient,
;	   shifted right by CONST_BITS-PASS1_BITS-8 and stored.
;
; Change against the earlier version: pos is doubled before it is used to
; fetch the coefficient word.  It was used as a byte offset there while the
; table lookup scaled the same value by the table size of 128 bytes.
;
; This routine lies inside a region that is switched off with ifeq 1.
; XDEF @j_rev_dct_sparse
@j_rev_dct_sparse:
	tst.l	d0
	bne	itsnotthedc
; the single element to cope with is the dc coefficient
	move.w	(a0),d1
	bpl.s	scale_dc
	subq.w	#3+4,d1			; "implement" the rounding error
scale_dc:
	addq.w	#4,d1			; +4 for dc >= 0, -3 in total for dc < 0
	asr.w	#3,d1
	move.w	d1,d0			; extend to longword
	swap	d0
	move.w	d1,d0			; same value in both halves
	moveq	#7,d1			; 8 turns of 4 longs = 64 words
set_dc:	move.l	d0,(a0)+
	move.l	d0,(a0)+
	move.l	d0,(a0)+
	move.l	d0,(a0)+
	dbra	d1,set_dc
	rts				; not that pretty
; bra exit_jrds
itsnotthedc:
	movem.l	d2/d3,-(sp)
; Some other coefficient.
	add.l	d0,d0			; pos -> byte offset of the word
	move.w	(a0,d0.w),d1		; get coeff
	lea	PreIDCT,a1		; get precalculated DCT
	lsl.l	#6,d0			; pos*2*64 = offset of table number pos
	add.l	d0,a1
	moveq	#CONST_BITS-PASS1_BITS-8,d3	; scale down
	moveq	#31,d0			; 32 turns of 2 words
set_ac:	move.w	d1,d2
	muls	(a1)+,d2
	lsr.l	d3,d2			; only the low word is kept
	move.w	d2,(a0)+
	move.w	d1,d2
	muls	(a1)+,d2
	lsr.l	d3,d2
	move.w	d2,(a0)+
	dbra	d0,set_ac
	movem.l	(sp)+,d2/d3
exit_jrds:
	rts
; ************************************************************************************
; Uninitialised data used by @init_pre_idct and @j_rev_dct_sparse.
section bss,BSS
;
; Precomputed idct value arrays
;
; 64*64 words; @init_pre_idct addresses the last 64-word table as
; PreIDCT+63*64*2.
PreIDCT: ds.w 64*64
; endc closes the ifeq 1 that precedes @init_pre_idct, so this section is part
; of the skipped region as well.
endc
; END